In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import folium
import folium.plugins
import seaborn as sns
import json
import re
In [2]:
%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 10
In [3]:
spark = (SparkSession.builder
 .master("local[*]")
 .getOrCreate())
In [4]:
spark.version
Out[4]:
u'2.1.1'
In [5]:
DATA_FOLDER = '/home/jovyan/work/data/housing'
In [6]:
houses = spark.read.parquet(DATA_FOLDER + '/hemnet').cache()
In [7]:
houses.filter(f.lower(houses.addres).like('%ankargatan%')).limit(5).toPandas()
Out[7]:
hemnetId objectType addres listPrice soldPrice ppsm lat lon livingArea rooms plotArea soldDate url state county city district suburb neighbourhood
0 560057 bostadsratt Ankargatan 9 2850000 2800000 34568.0 55.613375 12.974986 81.0 3.5 0.0 2016-08-26 /salda/bostadsratt-3,5rum-vastra-hamnen-malmo-... Skåne Malmö Malmö Norr Västra Hamnen Dockan
1 494987 bostadsratt Ankargatan 9 2595000 2625000 34539.0 55.613288 12.975118 76.0 3.0 0.0 2016-03-30 /salda/bostadsratt-3rum-vastra-hamnen-malmo-ko... Skåne Malmö Malmö Norr Västra Hamnen Dockan
2 615109 bostadsratt Ankargatan 9 2295000 2350000 35606.0 55.613304 12.975021 66.0 2.5 0.0 2016-12-20 /salda/bostadsratt-2,5rum-vastra-hamnen-malmo-... Skåne Malmö Malmö Norr Västra Hamnen Dockan
3 475697 villa Ankargatan 6 410000 400000 5714.0 56.745349 15.284065 70.0 3.0 912.0 2016-02-02 /salda/villa-3rum-lessebo-lessebo-kommun-ankar... Kronobergs län Lessebo None None None None
4 561504 bostadsratt Ankargatan 7 2700000 2725000 33978.0 59.596860 16.589209 80.2 3.0 0.0 2016-05-06 /salda/bostadsratt-3rum-oster-malarstrand-fram... Västmanlands län Västerås Västerås None Framnäs None
In [8]:
houses.printSchema()
root
 |-- hemnetId: long (nullable = true)
 |-- objectType: string (nullable = true)
 |-- addres: string (nullable = true)
 |-- listPrice: integer (nullable = true)
 |-- soldPrice: integer (nullable = true)
 |-- ppsm: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- livingArea: double (nullable = true)
 |-- rooms: double (nullable = true)
 |-- plotArea: double (nullable = true)
 |-- soldDate: string (nullable = true)
 |-- url: string (nullable = true)
 |-- state: string (nullable = true)
 |-- county: string (nullable = true)
 |-- city: string (nullable = true)
 |-- district: string (nullable = true)
 |-- suburb: string (nullable = true)
 |-- neighbourhood: string (nullable = true)

Check only for relevant year
In [9]:
houses.groupBy(f.substring(houses.soldDate, 0, 4).alias('year')).count().orderBy(f.col('year')).show()
+----+-----+
|year|count|
+----+-----+
|2013|   16|
|2014|   37|
|2015|   47|
|2016|93888|
|2017| 3770|
+----+-----+

In [10]:
houses = houses.filter(f.substring(houses.soldDate, 0, 4) == '2016')
In [11]:
houses.groupBy(houses.county).count().orderBy(f.col('count').desc()).show()
+-----------+-----+
|     county|count|
+-----------+-----+
|  Stockholm|15418|
|   Göteborg| 5612|
|      Malmö| 4283|
|      Solna| 1679|
|      Nacka| 1458|
|   Västerås| 1350|
|    Uppsala| 1192|
|Helsingborg| 1157|
|   Huddinge| 1123|
|       Täby| 1119|
|  Sundsvall| 1092|
|  Norrtälje| 1084|
|       Lund| 1082|
| Norrköping| 1077|
| Sundbyberg| 1037|
|   Järfälla| 1025|
|  Jönköping| 1005|
|  Linköping| 1003|
|   Karlstad|  972|
|      Gävle|  970|
+-----------+-----+
only showing top 20 rows

Overall house market
In [12]:
def plot(houses):
    positions = (houses
     .rdd.map(lambda row : (row.lat, row.lon))
    ).collect()
    m = folium.Map(
        location=[60.128161,18.643501], 
        zoom_start=5, 
        control_scale = False, 
        tiles='stamentoner', 
        attr='USGS style'        
    )
    m.add_child(folium.plugins.HeatMap(positions, radius=10))
    return m
In [13]:
plot(houses)
Out[13]:
In [14]:
def plot_circles(houses, threshold, lon=18.643501, lat=60.128161, zoom=5):
    houses = houses.withColumn('ppsm_rel', houses.ppsm/threshold)
    data = (houses
     .filter(houses.ppsm.isNotNull())
     .rdd.map(lambda row : (row.lat, row.lon, row.ppsm_rel))
    ).collect()
    data1 = map(lambda (lon, lat, radius) : ((lon,lat), radius), data)
    cmap = plt.get_cmap('coolwarm')
    m = folium.Map(
        location=[lat,lon], 
        zoom_start=zoom, 
        control_scale = False, 
        tiles='stamentoner', 
        attr='USGS style'        
    )
    for pos, rad in data1:
        color = matplotlib.colors.to_hex(cmap(rad))
        m.add_child(folium.CircleMarker(pos, radius=3, color=None, fill_color=color, fill_opacity=0.7))
    return m
In [15]:
plot_circles(houses.filter(~houses.county.isin(['Stockholm','Göteborg','Malmö'])).sample(False,0.2), 50000)
Out[15]:
In [16]:
plot_circles(houses.filter(houses.county == 'Malmö'), 50000, 13.010559, 55.569416, 11.5)
Out[16]:
In [17]:
plot_circles(houses.filter(houses.county == 'Göteborg'),80000,11.95365,57.6763,11)
Out[17]:
In [18]:
plot_circles(houses.filter(houses.county == 'Stockholm'),120000,18.073196,59.314783,11)
Out[18]:
Total house area versus PPSM
In [16]:
houses = (houses
 .withColumn('ppsm', houses.soldPrice / houses.livingArea)
 .withColumn('ppsmls', houses.listPrice / houses.livingArea)
)
In [29]:
corr = houses.select(houses.ppsm, houses.ppsmls).toPandas()
In [30]:
sns.regplot(x="ppsm", y="ppsmls", data=corr, truncate=True, marker='+', robust=True);

Tightly correlated list price and sales price.

Correlation Matrix
In [51]:
from sklearn.preprocessing import LabelEncoder
def neat_correlation(df, target_col):
    # Get categorical values and label them
    cat_idx = df.select_dtypes(['object']).fillna('N/A').apply(LabelEncoder().fit_transform).add_suffix('_labelled')
    
    train_cat_idx = pd.concat([df, cat_idx], axis=1)
    sale_price = train_cat_idx[target_col]
    train_cat_idx = train_cat_idx.drop(target_col, 1).sort_index(axis=1)
    train_cat_idx_sorted = pd.concat([sale_price, train_cat_idx], axis=1)
    
    corrmat = train_cat_idx_sorted.corr()
    plt.subplots(figsize=(16, 16))
    mask = np.zeros_like(corrmat)
    mask[np.triu_indices_from(mask)] = True
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.set(font_scale=1.2)
    return sns.heatmap(corrmat, mask=mask, cmap=cmap, square=True, annot=True)
In [29]:
hpd = houses.filter(houses.county == 'Malmö').toPandas()
In [60]:
neat_correlation(hpd.drop(['listPrice','ppsm','url','hemnetId'],1), 'soldPrice');
Malmö
In [31]:
prices = (houses
 .filter(houses.county == 'Malmö')
 .filter((houses.livingArea.isNotNull()) & (houses.livingArea != 0) & (houses.livingArea >= 30) & (houses.livingArea <= 199.99))
 .withColumn('Area (<=)', (f.ceil(houses.livingArea/10)*10))
 .withColumnRenamed('ppsm', 'Sale')
 .withColumnRenamed('ppsmls', 'List')
 .withColumnRenamed('objectType', 'Type')
).toPandas()
prices_melted = pd.melt(prices, id_vars=['Area (<=)', 'Type'], value_vars=['List','Sale'], value_name='Price per m2', var_name='Price')
In [32]:
sns.violinplot(data=prices_melted, x='Area (<=)', y='Price per m2', hue='Price', scale='count', split=True);
Göteborg
In [33]:
prices = (houses
 .filter(houses.county == 'Göteborg')
 .filter((houses.livingArea.isNotNull()) & (houses.livingArea != 0) & (houses.livingArea >= 30) & (houses.livingArea <= 199.99))
 .withColumn('Area (<=)', (f.ceil(houses.livingArea/10)*10))
 .withColumnRenamed('ppsm', 'Sale')
 .withColumnRenamed('ppsmls', 'List')
 .withColumnRenamed('objectType', 'Type')
).toPandas()
prices_melted = pd.melt(prices, id_vars=['Area (<=)', 'Type'], value_vars=['List','Sale'], value_name='Price per m2', var_name='Price')
In [34]:
sns.violinplot(data=prices_melted, x='Area (<=)', y='Price per m2', hue='Price', scale='count', split=True);
Stockholm
In [35]:
prices = (houses
 .filter(houses.county == 'Stockholm')
 .filter((houses.livingArea.isNotNull()) & (houses.livingArea != 0) & (houses.livingArea >= 30) & (houses.livingArea <= 199.99))
 .withColumn('Area (<=)', (f.ceil(houses.livingArea/10)*10))
 .withColumnRenamed('ppsm', 'Sale')
 .withColumnRenamed('ppsmls', 'List')
 .withColumnRenamed('objectType', 'Type')
).toPandas()
prices_melted = pd.melt(prices, id_vars=['Area (<=)', 'Type'], value_vars=['List','Sale'], value_name='Price per m2', var_name='Price')
In [36]:
sns.violinplot(data=prices_melted, x='Area (<=)', y='Price per m2', hue='Price', scale='count', split=True);
Object type versus PPSM
In [37]:
fp = sns.factorplot(
    data=prices_melted, 
    x='Area (<=)', 
    y='Price per m2', 
    hue='Price', 
    row='Type', 
    scale='count', 
    kind='violin', 
    size=6, 
    aspect=1.5, 
    split=True, 
    sharex=False
);
axes = fp.axes
axes[0,0].set_ylim(0,100000);
/opt/conda/envs/python2/lib/python2.7/site-packages/seaborn/categorical.py:762: RuntimeWarning: invalid value encountered in double_scalars
  scaler = count / counts[i].max()
In [38]:
positions = (houses
 .filter(f.col('ppsm').isNotNull())
 .groupBy(houses.lat, houses.lon, houses.objectType, houses.county).agg(f.avg('ppsm').alias('ppsm'))
# .filter(f.col('ppsm') < 910000.0) # Filter filthy rich persons abodes
 .filter(houses.county == 'Malmö')
# .filter((houses.lon >= 12.881184) & (houses.lon <= 13.150761) & (houses.lat >= 55.500386) & (houses.lat <= 55.639951))
)
In [39]:
sns.violinplot(x='objectType', y='ppsm', scale='width', data=positions.toPandas(), figsize=(20,10));
In [40]:
sns.violinplot(x='objectType', y='ppsm', split=True, scale='count', data=positions.toPandas(), figsize=(20,10));